04_describe

WHO 2025 data on Tuberculosis, data description

In this script, it’s intended to give an explanation on data features, using different methods and graphs. The provided data description is based on three datasets:

Overall, the document aims to do some light-weight analysis and data exploration, prior to the heavy-weight analysis in the subsequent 2x analysis files.


Loading relevant libraries:

library(tidyverse) 
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

#Access to the function for loading the datasets and to save them
source("99_proj_func.R")

Loading data:

Joined dataset, year 2024:

#Loading age and sex and risk group data (2024 only) - Joined and augmented version of the data: 

data_file          <- "03_aug_TB_age_sex.tsv"
TB_age_sex_joined  <- load_data(data_file)
Loading ../data/03_aug_TB_age_sex.tsv from local file…
#Display the data: 
slice_sample(TB_age_sex_joined, n=5)
# A tibble: 5 × 18
  country            year age_group sex   risk_factor TB_cases_best TB_cases_min
  <chr>             <dbl> <chr>     <chr> <chr>               <dbl>        <dbl>
1 Bahrain            2024 15-24     Fema… no risk fa…             1            1
2 Spain              2024 65+       Male  no risk fa…           530          170
3 South Africa       2024 15+       Fema… no risk fa…        101000        37000
4 Kyrgyzstan         2024 15+       Both  no risk fa…          8500         6300
5 United States of…  2024 5-9       Both  no risk fa…           130           67
# ℹ 11 more variables: TB_cases_max <dbl>, population_size <dbl>,
#   total_TB_cases_best <dbl>, total_TB_cases_min <dbl>,
#   total_TB_cases_max <dbl>, TB_cases_pr_100k_best <dbl>,
#   TB_cases_pr_100k_min <dbl>, TB_cases_pr_100k_max <dbl>,
#   total_TB_cases_pr_100k_best <dbl>, total_TB_cases_pr_100k_min <dbl>,
#   total_TB_cases_pr_100k_max <dbl>

Joined dataset, year 2015 to 2024:

#Loading age and sex and risk group data (2024 only) - Joined version of the data: 

data_file           <- "03_aug_TB_10_years.tsv"
TB_10_years_joined  <- load_data(data_file)
Loading ../data/03_aug_TB_10_years.tsv from local file…
#Display the data: 
slice_sample(TB_10_years_joined, n=5)
# A tibble: 5 × 75
  country   g_whoregion.x  year e_pop_num e_inc_100k e_inc_100k_lo e_inc_100k_hi
  <chr>     <chr>         <dbl>     <dbl>      <dbl>         <dbl>         <dbl>
1 Gabon     AFR            2023   2484780      380             232         733  
2 New Zeal… WPR            2020   5069891        7.3             6           8.9
3 Lesotho   AFR            2015   2104605      780             436        1220  
4 Democrat… AFR            2020  95989997      409             250         785  
5 Ecuador   AMR            2021  17682453       41              33          49  
# ℹ 68 more variables: e_inc_num <dbl>, e_inc_num_lo <dbl>, e_inc_num_hi <dbl>,
#   e_tbhiv_prct <dbl>, e_inc_tbhiv_100k <dbl>, e_inc_tbhiv_100k_lo <dbl>,
#   e_inc_tbhiv_100k_hi <dbl>, e_inc_tbhiv_num <dbl>, e_inc_tbhiv_num_lo <dbl>,
#   e_inc_tbhiv_num_hi <dbl>, e_mort_exc_tbhiv_100k <dbl>,
#   e_mort_exc_tbhiv_100k_lo <dbl>, e_mort_exc_tbhiv_100k_hi <dbl>,
#   e_mort_exc_tbhiv_num <dbl>, e_mort_exc_tbhiv_num_lo <dbl>,
#   e_mort_exc_tbhiv_num_hi <dbl>, e_mort_tbhiv_100k <dbl>, …

WHO TB burden estimates [>1Mb]:

#Loading TB burden estimates: 

data_file   <- "02_clean_TB_burden.tsv"
TB_burden   <- load_data(data_file)
Loading ../data/02_clean_TB_burden.tsv from local file…
#Display the data: 
slice_sample(TB_burden, n=5)
# A tibble: 5 × 45
  country     g_whoregion  year e_pop_num e_inc_100k e_inc_100k_lo e_inc_100k_hi
  <chr>       <chr>       <dbl>     <dbl>      <dbl>         <dbl>         <dbl>
1 Uganda      AFR          2022  47312720      198             119         297  
2 Senegal     AFR          2008  12048957      184             106         390  
3 Oman        EMR          2021   4500426        7.4             6           8.9
4 Russian Fe… EUR          2016 145778682       63              36          98  
5 Cameroon    AFR          2001  15309492      131              71         319  
# ℹ 38 more variables: e_inc_num <dbl>, e_inc_num_lo <dbl>, e_inc_num_hi <dbl>,
#   e_tbhiv_prct <dbl>, e_inc_tbhiv_100k <dbl>, e_inc_tbhiv_100k_lo <dbl>,
#   e_inc_tbhiv_100k_hi <dbl>, e_inc_tbhiv_num <dbl>, e_inc_tbhiv_num_lo <dbl>,
#   e_inc_tbhiv_num_hi <dbl>, e_mort_exc_tbhiv_100k <dbl>,
#   e_mort_exc_tbhiv_100k_lo <dbl>, e_mort_exc_tbhiv_100k_hi <dbl>,
#   e_mort_exc_tbhiv_num <dbl>, e_mort_exc_tbhiv_num_lo <dbl>,
#   e_mort_exc_tbhiv_num_hi <dbl>, e_mort_tbhiv_100k <dbl>, …

Dictionary:

#Loading the data dictionary: 

data_file   <- "01_load_dictionary.tsv"

TB_dictionary <- load_data(data_file)
Loading ../data/01_load_dictionary.tsv from local file…
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
slice_sample(TB_dictionary, n=5)
# A tibble: 5 × 4
  variable_name       dataset           code_list   definition                  
  <chr>               <chr>             <chr>       <chr>                       
1 private_sector_link UNHLM commitments 0=No; 1=yes Have links with the private…
2 lmis                Laboratories      <NA>        Number of sites which by th…
3 budget_cpp_tpt      Budget            <NA>        Average cost of drugs budge…
4 new_sp_fu           Notification      <NA>        New pulmonary smear positiv…
5 xdr_died            Outcomes          <NA>        Outcomes for XDR-TB cases: …

Data description:

*TB_age_sex_joined (2024) - Description:

This dataset contains the number of TB cases across different countries, categorized by age group and gender. It also includes cases per 100,000 population, enabling standardized and comparable analysis between countries.

slice_sample(TB_age_sex_joined, n=5)
# A tibble: 5 × 18
  country           year age_group sex    risk_factor TB_cases_best TB_cases_min
  <chr>            <dbl> <chr>     <chr>  <chr>               <dbl>        <dbl>
1 Marshall Islands  2024 25-34     Female no risk fa…             2            0
2 Bahamas           2024 55-64     Male   no risk fa…             8            3
3 Kuwait            2024 18+       Both   diabetes               71           44
4 Colombia          2024 5-9       Both   no risk fa…           170           91
5 Zimbabwe          2024 15-24     Both   no risk fa…          4300         1400
# ℹ 11 more variables: TB_cases_max <dbl>, population_size <dbl>,
#   total_TB_cases_best <dbl>, total_TB_cases_min <dbl>,
#   total_TB_cases_max <dbl>, TB_cases_pr_100k_best <dbl>,
#   TB_cases_pr_100k_min <dbl>, TB_cases_pr_100k_max <dbl>,
#   total_TB_cases_pr_100k_best <dbl>, total_TB_cases_pr_100k_min <dbl>,
#   total_TB_cases_pr_100k_max <dbl>

We can briefly explore the big differences between comparing countries based on their total TB cases vs comparing with TB cases pr. 100k citizens.

Scatter plot of the countries with the top 10 most TB cases in total:

#Getting the 10 countries with highest total amount of TB cases:  
top_10_countries <- 
  TB_age_sex_joined |>
  group_by(country) |>
  summarise(total_TB = first(total_TB_cases_best)) |>
  arrange(desc(total_TB)) |>
  slice_head(n = 10) |> 
  pull(country)

#Making a tibble for those countries, and plotting them: 
plot <- TB_age_sex_joined |>
  filter(country %in% top_10_countries) |>
  group_by(country) |>
  summarise(
    mean_best = sum(TB_cases_best, na.rm = TRUE), #Sum of TB cases for this country 
    min_val  = sum(TB_cases_min, na.rm = TRUE),
    max_val  = sum(TB_cases_max, na.rm = TRUE),
  ) |> 
  
  ggplot(aes(x = mean_best, y = fct_reorder(country, mean_best))) +
  #fct_reorder(country, mean_best) ensures that we order sort all countries,
  #based on mean_best value (descending order). 
  
  geom_point(size = 3, color = "orange") +
  geom_errorbarh(aes(xmin = min_val, xmax = max_val), height = 0.2) +
  labs(
    x = "TB cases\n(Best estimate with min/max)",
    y = "Country",
    title = "Top 10 Countries - Total TB cases, with error bars"
  ) +
  theme_minimal()

#Save it

ggsave(
  filename = "../results/04_1_top10_TB.png",   #Choose the folder + filename
  plot = plot,
  width = 8,       # inches
  height = 5,      # inches
  dpi = 300        # high quality
)

plot

Note that really large countries like China and India is part of this graph.

That is not to state that TB is not a problem in these countries, but it is to highlight that the TB intensity might not be as bad as you might think.

This will make sense once you glance at the following plot.

Scatter plot of the countries with the top 10 most TB cases pr. 100k citizens (standardized):

#Making an object for storing the top 10 countries with most TB cases: 
top_10_countries_100k <- TB_age_sex_joined |>
  group_by(country) |>
  summarise(
    total_TB_cases_pr_100k_best = first(total_TB_cases_pr_100k_best),
    #Note: The value is constant for each country, so we just use first()
    #in order to pick the first value (we want to reduce several rows  
    #to 1 row pr. country) 
    ) |>
  arrange(desc(total_TB_cases_pr_100k_best)) |>
  slice_head(n = 10) |>
  pull(country)

#Making a tibble for those countries, and plotting them: 
plot <- TB_age_sex_joined |>
  filter(country %in% top_10_countries_100k) |>
  group_by(country) |>
  summarise(
    mean_best = first(total_TB_cases_pr_100k_best),
    min_val  = first(total_TB_cases_pr_100k_min),
    max_val  = first(total_TB_cases_pr_100k_max),
  ) |> 
  
    ggplot(aes(x = mean_best, y = fct_reorder(country, mean_best))) +
  #fct_reorder(country, mean_best) ensures that we order sort all countries,
  #based on mean_best value (descending order). 
  
  geom_point(size = 3, color = "orange") +
  geom_errorbarh(aes(xmin = min_val, xmax = max_val), height = 0.2) +
  labs(
    x = "TB cases pr. 100k\n(Best estimate with min/max)",
    y = "Country",
    title = "Top 10 Countries - TB cases pr. 100k citizens, with error bars"
  ) +
  theme_minimal()

#Save it

ggsave(
  filename = "../results/04_2_top10_TB_100k.png",   #Choose the folder + filename
  plot = plot,
  width = 8,       # inches
  height = 5,      # inches
  dpi = 300        # high quality
)
plot

Would you look at that!

Except for the Philippines, none of these countries were part of the plot for the “top 10 total TB cases countries”.

It goes to show that the standardized TB cases pr. 100k of citizens might be a better measure for TB disease intensity of a country.

*TB_10_years_joined (2015-2024) - Description:

We combined three WHO datasets — TB burden estimates, MDR/RR-TB burden estimates, and TB infection in household contacts — into a single multi-country, multi-year panel covering approximately 10 years.

The merged dataset contains measures of TB incidence and mortality, MDR/RR-TB incidence, and estimated household infection rates for each country–year.

This integrated dataset allows us to describe global TB trends, compare drug-resistant and drug-sensitive TB, and evaluate household transmission indicators.

(Note: Multidrug-resistant tuberculosis (MDR-TB) is defined as disease due to Mycobacterium tuberculosis that is resistant to isoniazid (H) and rifampicin (R) with or without resistance to other drugs. RR - Rifampicin resistant).

slice_sample(TB_10_years_joined, n=5)
# A tibble: 5 × 75
  country   g_whoregion.x  year e_pop_num e_inc_100k e_inc_100k_lo e_inc_100k_hi
  <chr>     <chr>         <dbl>     <dbl>      <dbl>         <dbl>         <dbl>
1 Spain     EUR            2022  47828387        6.3           5.1           7.6
2 Portugal  EUR            2015  10370263       23            19            28  
3 Finland   EUR            2015   5479722        5.7           4.6           6.8
4 Rwanda    AFR            2018  12488000       64            45            87  
5 China, H… WPR            2020   7490233       75            61            91  
# ℹ 68 more variables: e_inc_num <dbl>, e_inc_num_lo <dbl>, e_inc_num_hi <dbl>,
#   e_tbhiv_prct <dbl>, e_inc_tbhiv_100k <dbl>, e_inc_tbhiv_100k_lo <dbl>,
#   e_inc_tbhiv_100k_hi <dbl>, e_inc_tbhiv_num <dbl>, e_inc_tbhiv_num_lo <dbl>,
#   e_inc_tbhiv_num_hi <dbl>, e_mort_exc_tbhiv_100k <dbl>,
#   e_mort_exc_tbhiv_100k_lo <dbl>, e_mort_exc_tbhiv_100k_hi <dbl>,
#   e_mort_exc_tbhiv_num <dbl>, e_mort_exc_tbhiv_num_lo <dbl>,
#   e_mort_exc_tbhiv_num_hi <dbl>, e_mort_tbhiv_100k <dbl>, …
df10 <- TB_10_years_joined #just shortening the name so we don't have to type so much

Definitions of all the variables that were not renamed:

df_colnames <- tibble(colname = colnames(df10))
df_colnames
# A tibble: 75 × 1
   colname      
   <chr>        
 1 country      
 2 g_whoregion.x
 3 year         
 4 e_pop_num    
 5 e_inc_100k   
 6 e_inc_100k_lo
 7 e_inc_100k_hi
 8 e_inc_num    
 9 e_inc_num_lo 
10 e_inc_num_hi 
# ℹ 65 more rows
dict_filtered <- TB_dictionary[c("variable_name", "definition")] |> 
   filter(variable_name %in% (df_colnames |> 
                                pull(colname)))
print(dict_filtered)
# A tibble: 43 × 2
   variable_name definition                                                     
   <chr>         <chr>                                                          
 1 country       Country or territory name                                      
 2 rr_new        Number of new bacteriologically confirmed pulmonary TB patient…
 3 c_newinc_100k Case notification rate, which is the total of new and relapse …
 4 cfr           Estimated TB case fatality ratio                               
 5 cfr_hi        Estimated TB case fatality ratio: high bound                   
 6 cfr_lo        Estimated TB case fatality ratio: low bound                    
 7 cfr_pct       Estimated TB case fatality ratio expressed as a percentage     
 8 cfr_pct_hi    Estimated TB case fatality ratio: high bound expressed as a pe…
 9 cfr_pct_lo    Estimated TB case fatality ratio: low bound expressed as a per…
10 e_inc_100k    Estimated incidence (all forms) per 100 000 population         
# ℹ 33 more rows

After considering the definitions, we can select only the important variables that hold the most descriptive information to get to know the data. We are selecting data per 100k to get the most comparable summary.

df10_selection <- df10 |> 
      select(
        country,
        year,
        case_fatality_ratio = cfr,
        incidence_100k = e_inc_100k,
        incidence_hiv_100k = e_inc_tbhiv_100k,
        mortality_100k = e_mort_100k,
        mortality_hiv_100k = e_mort_tbhiv_100k,
        mortality_nohiv_100k = e_mort_exc_tbhiv_100k,
        rr_treated,
        rr_incidence,
        rr_labconf_pulm,
        contacts_best = Contacts_best,
        preventive_tx = Preventive_Tx_Pct
)
df10_selection
# A tibble: 1,847 × 13
   country      year case_fatality_ratio incidence_100k incidence_hiv_100k
   <chr>       <dbl>               <dbl>          <dbl>              <dbl>
 1 Afghanistan  2015                0.22            200               0.05
 2 Afghanistan  2016                0.19            204               0.06
 3 Afghanistan  2017                0.18            209               0.06
 4 Afghanistan  2018                0.18            212               0.06
 5 Afghanistan  2019                0.17            213               0.06
 6 Afghanistan  2020                0.2             205               0.03
 7 Afghanistan  2021                0.2             206               0.01
 8 Afghanistan  2022                0.19            206               0.06
 9 Afghanistan  2023                0.19            204               0.05
10 Afghanistan  2024                0.19            203               0.08
# ℹ 1,837 more rows
# ℹ 8 more variables: mortality_100k <dbl>, mortality_hiv_100k <dbl>,
#   mortality_nohiv_100k <dbl>, rr_treated <dbl>, rr_incidence <dbl>,
#   rr_labconf_pulm <dbl>, contacts_best <dbl>, preventive_tx <dbl>

Summary statistics for main TB variables

df10_selection %>%
  reframe(across(where(is.numeric),
                   list(mean = mean, sd = sd),
                   na.rm = TRUE))
Warning: There was 1 warning in `reframe()`.
ℹ In argument: `across(where(is.numeric), list(mean = mean, sd = sd), na.rm =
  TRUE)`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.

  # Previously
  across(a:b, mean, na.rm = TRUE)

  # Now
  across(a:b, \(x) mean(x, na.rm = TRUE))
# A tibble: 1 × 24
  year_mean year_sd case_fatality_ratio_mean case_fatality_ratio_sd
      <dbl>   <dbl>                    <dbl>                  <dbl>
1     2019.    2.86                    0.139                  0.103
# ℹ 20 more variables: incidence_100k_mean <dbl>, incidence_100k_sd <dbl>,
#   incidence_hiv_100k_mean <dbl>, incidence_hiv_100k_sd <dbl>,
#   mortality_100k_mean <dbl>, mortality_100k_sd <dbl>,
#   mortality_hiv_100k_mean <dbl>, mortality_hiv_100k_sd <dbl>,
#   mortality_nohiv_100k_mean <dbl>, mortality_nohiv_100k_sd <dbl>,
#   rr_treated_mean <dbl>, rr_treated_sd <dbl>, rr_incidence_mean <dbl>,
#   rr_incidence_sd <dbl>, rr_labconf_pulm_mean <dbl>, …

Summary table of NA values (of selected values for better clarity):

missing_summary <- df10_selection %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  arrange(desc(n_missing))
missing_summary
# A tibble: 13 × 2
   variable             n_missing
   <chr>                    <int>
 1 preventive_tx              661
 2 incidence_hiv_100k         133
 3 case_fatality_ratio        113
 4 mortality_hiv_100k         113
 5 mortality_nohiv_100k       113
 6 rr_treated                 113
 7 rr_incidence               113
 8 rr_labconf_pulm            113
 9 country                      0
10 year                         0
11 incidence_100k               0
12 mortality_100k               0
13 contacts_best                0

Heatmap which highlights where values are missing:

missing_long <- df10_selection%>%
  mutate(row = row_number()) %>%
  mutate(across(everything(), as.character)) %>%   #convert values to character
  pivot_longer(-row, names_to = "variable", values_to = "value") %>%
  mutate(is_missing = is.na(value) | value == "NA")


ggplot(missing_long, aes(x = row, y = variable, fill = is_missing)) +
  geom_tile() +
  scale_fill_manual(values = c("FALSE" = "white", "TRUE" = "black")) +
  labs(title = "Missing values - Heatmap",
       x = "Row",
       y = "Variable",
       fill = "Missing") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),    # hide x labels 
        axis.ticks.x = element_blank())

Missing values per country:

#Countries with most missing values: 
missing_by_country <- df10_selection |>
  group_by(country) |>
  summarise(
    total_rows = n(),
    total_missing = sum(is.na(across(everything()))),
    prop_missing = total_missing / (total_rows * ncol(df10))
  ) |>
  arrange(desc(prop_missing)) |> 
  slice_head(n=10)

#Top 10 countries with most missing values:
df_top10 <- df10_selection |>
  filter(country %in% missing_by_country$country) |>
  mutate(row = row_number())

#Prepare for missing values heatmap: 
missing_long <- df_top10 |>
  mutate(across(-c(country, row), ~ is.na(.))) |>  # logical missing values
  pivot_longer(-c(country, row), names_to = "variable", values_to = "is_missing")

#Plot heatmap
ggplot(missing_long, aes(x = variable, y = row, fill = is_missing)) +
  geom_tile() +
  facet_wrap(~ country, scales = "free_y") +  # separate heatmap per country
  scale_fill_manual(values = c("FALSE" = "white", "TRUE" = "black")) +
  labs(
    title = "Missing values - Heatmap for countries with the top 10 most missing values",
    x = "Variable",
    y = "Row",
    fill = "Missing"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    strip.text = element_text(face = "bold")
  )

Global average of trends over time:

df10_selection
# A tibble: 1,847 × 13
   country      year case_fatality_ratio incidence_100k incidence_hiv_100k
   <chr>       <dbl>               <dbl>          <dbl>              <dbl>
 1 Afghanistan  2015                0.22            200               0.05
 2 Afghanistan  2016                0.19            204               0.06
 3 Afghanistan  2017                0.18            209               0.06
 4 Afghanistan  2018                0.18            212               0.06
 5 Afghanistan  2019                0.17            213               0.06
 6 Afghanistan  2020                0.2             205               0.03
 7 Afghanistan  2021                0.2             206               0.01
 8 Afghanistan  2022                0.19            206               0.06
 9 Afghanistan  2023                0.19            204               0.05
10 Afghanistan  2024                0.19            203               0.08
# ℹ 1,837 more rows
# ℹ 8 more variables: mortality_100k <dbl>, mortality_hiv_100k <dbl>,
#   mortality_nohiv_100k <dbl>, rr_treated <dbl>, rr_incidence <dbl>,
#   rr_labconf_pulm <dbl>, contacts_best <dbl>, preventive_tx <dbl>
global_yearly_trends <- df10_selection |> 
  group_by(year) |> 
  summarise(
    incidence_mean = mean(incidence_100k, na.rm = TRUE),
    rr_incidence_mean = mean(rr_incidence, na.rm = TRUE),
    household_inf_mean = mean(contacts_best, na.rm = TRUE)
  )

global_yearly_trends
# A tibble: 10 × 4
    year incidence_mean rr_incidence_mean household_inf_mean
   <dbl>          <dbl>             <dbl>              <dbl>
 1  2015           133.             3422.             47988.
 2  2016           125.             3143.             49461.
 3  2017           118.             2883.             51400.
 4  2018           113.             2716.             51221.
 5  2019           111.             2600.             55363.
 6  2020           105.             2471.             46000.
 7  2021           103.             2381.             56204.
 8  2022           105.             2326.             65069.
 9  2023           104.             2360.             71849.
10  2024           108.             2344.             78278.

Prepare for plotting - to long shape and standardization, because variables are on different scales (per different number of people)

global_yearly_trends_long <- global_yearly_trends |> 
  mutate(across(
    -year,
    ~ (.x - mean(.x, na.rm = TRUE)) / sd(.x, na.rm = TRUE)
  )) |> 
  
  pivot_longer(
    cols = -year,
    names_to = "variable",
    values_to = "value"
  )

global_yearly_trends_long
# A tibble: 30 × 3
    year variable             value
   <dbl> <chr>                <dbl>
 1  2015 incidence_mean      2.04  
 2  2015 rr_incidence_mean   2.00  
 3  2015 household_inf_mean -0.855 
 4  2016 incidence_mean      1.24  
 5  2016 rr_incidence_mean   1.27  
 6  2016 household_inf_mean -0.719 
 7  2017 incidence_mean      0.583 
 8  2017 rr_incidence_mean   0.578 
 9  2017 household_inf_mean -0.541 
10  2018 incidence_mean      0.0213
# ℹ 20 more rows

Exploratory plots - Global average of TB incidence over time:

ggplot(global_yearly_trends_long, aes(x = year, y = value, color = variable)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  scale_x_continuous(breaks = unique(global_yearly_trends_long$year)) +
  labs(
    title = "Global Yearly Trends",
    x = "Year",
    y = "Value",
    color = "Indicator"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Interestingly, household contacts seemed to have dropped rapidly during Covid-19 quarantine (happened around april of 2020), only to increase rapidly in the time afterwards.